Introduction

This script contains two analyses performed using GPT. One analysis is split into two parts, one focused on valence, the other one focused on change/improve verbs.

For the valence analysis, we used the test frames:

The word “add” is a positive word. The word “add” is a negative word.

… and so on, for each addition- or subtraction-related word from our list (Table 1). We then look at the log probability of the words “positive” and “negative” in this context.

For the change/improve verb analysis, we used the test frames:

I suggest we change this by adding. I suggest we change this by subtracting. I suggest we change this by removing.

… and so on, for each verb that is a direct synonym of to change or each verb that is a direct synonym of to improve. We then look at the log probability of the adding/subtracting/removing case.

Setup

Load packages:

library(tidyverse) # for data processing
library(brms) # for bayesian models
library(tidybayes) # for half-eye plots
library(effsize) # for cohen's d
library(patchwork) # for double plots

Load both datasets:

# Cloze probability for change verbs:

suggest <- read_csv('../data/GPT3_close_probabilities.csv')

# Valence cloze probability:

good_bad <- read_csv('../data/GPT3_good_bad_connotation_cloze_probability.csv')

Show both:

suggest
## # A tibble: 60 × 7
##    text        verb  synonym_set cloze_verb verb_prob verb_logprob response_text
##    <chr>       <chr> <chr>       <chr>          <dbl>        <dbl> <chr>        
##  1 I suggest … chan… change      adding          7.53        -2.59 <NA>         
##  2 I suggest … chan… change      subtracti…      0.01        -9.51 <NA>         
##  3 I suggest … chan… change      removing        1.77        -4.03 <NA>         
##  4 I suggest … adju… change      adding         11.1         -2.2  <NA>         
##  5 I suggest … adju… change      subtracti…      0.23        -6.08 <NA>         
##  6 I suggest … adju… change      removing        2           -3.91 <NA>         
##  7 I suggest … conv… change      adding          1.09        -4.51 <NA>         
##  8 I suggest … conv… change      subtracti…      0.08        -7.16 <NA>         
##  9 I suggest … conv… change      removing        1.12        -4.5  <NA>         
## 10 I suggest … mode… change      adding          4.94        -3.01 <NA>         
## # … with 50 more rows
good_bad
## # A tibble: 28 × 7
##    text       context word   pair    type  valence_word_lo… completion_text     
##    <chr>      <chr>   <chr>  <chr>   <chr>            <dbl> <chr>               
##  1 "The word… good    add    add/su… add              -7.61  <NA>               
##  2 "The word… bad     add    add/su… add              -7.51  <NA>               
##  3 "The word… good    subtr… add/su… subt…            -9.07  <NA>               
##  4 "The word… bad     subtr… add/su… subt…            -8.14  <NA>               
##  5 "The word… good    addit… additi… add              -8.17  <NA>               
##  6 "The word… bad     addit… additi… add              -9.72  <NA>               
##  7 "The word… good    subtr… additi… subt…           -10.7   <NA>               
##  8 "The word… bad     subtr… additi… subt…            -9.76 "The word \"subtrac…
##  9 "The word… good    plus   plus/m… add              -8.27  <NA>               
## 10 "The word… bad     plus   plus/m… add              -9.93  <NA>               
## # … with 18 more rows

Valence analysis

Can’t use annotation with facets, so will have to use geoms. For this, setup a tibble with the relevant data:

# Extract y-points to plot:

bad_points <- good_bad %>% 
  filter(type == 'subtract', context == 'bad') %>%
  pull(valence_word_logprob)
good_points <- good_bad %>%
  filter(type == 'subtract', context == 'good') %>%
  pull(valence_word_logprob)

# Extract labels to plot:

bad_words <- good_bad %>% 
  filter(type == 'subtract', context == 'bad') %>%
  pull(pair)
good_words <- good_bad %>%
  filter(type == 'subtract', context == 'good') %>%
  pull(pair)

# Put into tibble:

text_df <- tibble(valence_word_logprob = c(bad_points, good_points),
                  context = rep(c('bad', 'good'), each = length(bad_points)),
                  type = rep('subtract', length(bad_points) * 2),
                  pair = c(bad_words, good_words),
                  x_pos = 2.1)

Hand-adjust overlapping values:

# Increase/decrease up and add/subtract down in bad panel:

row_id <- which(text_df$pair == 'increase/decrease' & text_df$context == 'bad')
text_df[row_id, 'valence_word_logprob'] <- text_df[row_id, 'valence_word_logprob'] + 0.095

row_id <- which(text_df$pair == 'add/subtract' & text_df$context == 'bad')
text_df[row_id, 'valence_word_logprob'] <- text_df[row_id, 'valence_word_logprob'] - 0.055

# More/less up and plus/minus down in bad panel:

row_id <- which(text_df$pair == 'more/less' & text_df$context == 'bad')
text_df[row_id, 'valence_word_logprob'] <- text_df[row_id, 'valence_word_logprob'] + 0.1

row_id <- which(text_df$pair == 'plus/minus' & text_df$context == 'bad')
text_df[row_id, 'valence_word_logprob'] <- text_df[row_id, 'valence_word_logprob'] - 0.04

# Plus/minus and increase/decrease up and add/subtract down in good panel:

row_id <- which(text_df$pair == 'plus/minus' & text_df$context == 'good')
text_df[row_id, 'valence_word_logprob'] <- text_df[row_id, 'valence_word_logprob'] + 0.03

row_id <- which(text_df$pair == 'increase/decrease' & text_df$context == 'good')
text_df[row_id, 'valence_word_logprob'] <- text_df[row_id, 'valence_word_logprob'] + 0.04

row_id <- which(text_df$pair == 'add/subtract' & text_df$context == 'good')
text_df[row_id, 'valence_word_logprob'] <- text_df[row_id, 'valence_word_logprob'] - 0.08

# Most/least up in good panel:

row_id <- which(text_df$pair == 'most/least' & text_df$context == 'good')
text_df[row_id, 'valence_word_logprob'] <- text_df[row_id, 'valence_word_logprob'] + 0.045

Make the plot:

# Define plot and aesthetics:

good_bad_p <- good_bad %>% 
  ggplot(aes(y = valence_word_logprob, x = type, group = pair, fill = type))

# Add geoms:

good_bad_p <- good_bad_p +
  geom_line(col = 'grey') +
  geom_point(size = 3, shape = 21, alpha = 0.85) +
  geom_text(data = text_df,
            mapping = aes(x = x_pos,
                          y = valence_word_logprob,
                          label = pair),
            hjust = 0, size = 3) +
  facet_wrap(~context)

# Add scales and axes labels:

good_bad_p <- good_bad_p +
  scale_fill_manual(values = c("#E69F00", "#0072B2")) +
  scale_y_continuous(breaks = seq(-12, -6, 1)) +
  coord_cartesian(xlim = c(1.3, 2.8), ylim = c(-11, -6.5),
                  clip = 'off') +
  xlab(NULL) +
  ylab('Log probability (GPT3)')

# Add themes:

good_bad_p <- good_bad_p +
  theme_classic() +
  theme(legend.position = 'none') +
  theme(axis.title.y = element_text(margin = margin(t = 0, r = 15,
                                                    b = 0, l = 0),
                                    size = 14, face = 'bold'),
        axis.text.x = element_text(face = 'bold', size = 10),
        plot.margin = margin(r = 5, l = 5, b = 5, t = 5))

# Show and save:

good_bad_p

ggsave(plot = good_bad_p,
       filename = '../figures/pdf/GPT_good_bad.pdf',
       width = 5.5, height = 3.7)
ggsave(plot = good_bad_p,
       filename = '../figures/png/GPT_good_bad.png',
       width = 5.5, height = 3.7)
ggsave(plot = good_bad_p,
       filename = '../figures/tiff/GPT_good_bad.tiff',
       width = 5.5, height = 3.7)

Descriptive averages:

good_bad %>% 
  group_by(context, type) %>% 
  summarize(M = mean(valence_word_logprob)) %>% 
  mutate(prob = exp(M))
## `summarise()` has grouped output by 'context'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 4
## # Groups:   context [2]
##   context type         M      prob
##   <chr>   <chr>    <dbl>     <dbl>
## 1 bad     add      -9.31 0.0000904
## 2 bad     subtract -8.93 0.000132 
## 3 good    add      -7.63 0.000488 
## 4 good    subtract -8.70 0.000166

Effect size of difference in negative, and in positive seperately:

cohen.d(valence_word_logprob ~ type | Subject(pair),
        data = filter(good_bad, context == 'good'), paired = TRUE)
## 
## Cohen's d
## 
## d estimate: 1.105919 (large)
## 95 percent confidence interval:
##      lower      upper 
## 0.09102965 2.12080865
cohen.d(valence_word_logprob ~ type | Subject(pair),
        data = filter(good_bad, context == 'bad'), paired = TRUE)
## 
## Cohen's d
## 
## d estimate: -0.4808853 (small)
## 95 percent confidence interval:
##      lower      upper 
## -1.2318781  0.2701075

Model valence

Sum-code the type and context predictors:

good_bad <- mutate(good_bad,
                   context_c = factor(context, levels = c('bad', 'good')),
                   type_c = factor(type, levels = c('add', 'subtract')))

We set bad and add as the reference level so that we get a negative interaction coefficient, which is easier to talk about with respect to the figure, which shows a big drop of acceptability for the subtraction-related words in the good context.

Set contrasts:

contrasts(good_bad$context_c) <- contr.sum(2)
contrasts(good_bad$type_c) <- contr.sum(2)

Model the word valence:

good_bad_mdl <- brm(valence_word_logprob ~ 1 + 
                      type_c + context_c + type_c:context_c +
                      (1|pair),
                    
                    data = good_bad,
                    family = gaussian,

                    # MCMC settings:

                    seed = 666, chains = 4, cores = 4,
                    warmup = 4000, iter = 6000,
                    control = list(adapt_delta = 0.99))
## Compiling Stan program...
## Start sampling

Save the model:

save(good_bad_mdl, file = '../models/good_bad_mdl.RData')

Alternatively, load the model (to save time):

load('../models/good_bad_mdl.RData')

Perform posterior predictive checks (with empirical cumulative distribution function):

pp_check(good_bad_mdl,
         ndraws = 100)

Check the model:

good_bad_mdl
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: valence_word_logprob ~ 1 + type_c + context_c + type_c:context_c + (1 | pair) 
##    Data: good_bad (Number of observations: 28) 
##   Draws: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
##          total post-warmup draws = 8000
## 
## Group-Level Effects: 
## ~pair (Number of levels: 7) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.48      0.30     0.04     1.19 1.00     1856     2547
## 
## Population-Level Effects: 
##                    Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept             -8.65      0.25    -9.15    -8.15 1.00     2715     2971
## type_c1                0.18      0.15    -0.12     0.46 1.00     7722     4778
## context_c1            -0.48      0.15    -0.76    -0.19 1.00     7307     5425
## type_c1:context_c1    -0.37      0.15    -0.66    -0.07 1.00     7787     5384
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.77      0.13     0.56     1.08 1.00     3416     5466
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Perform a test of the posterior probability of the interaction effect being positive.

hypothesis(good_bad_mdl, 'type_c1:context_c1 > 0')
## Hypothesis Tests for class b:
##                 Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio
## 1 (type_c1:context_c1) > 0    -0.37      0.15    -0.61    -0.12       0.01
##   Post.Prob Star
## 1      0.01     
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.

Do this by hand by extracting posterior samples:

good_bad_samples <- posterior_samples(good_bad_mdl) %>% 
  rename(interaction = `b_type_c1:context_c1`)
## Warning: Method 'posterior_samples' is deprecated. Please see ?as_draws for
## recommended alternatives.

Plot the posterior:

post_p <- good_bad_samples %>% 
  ggplot(aes(x = interaction)) +
  stat_halfeye(fill = 'steelblue', alpha = 0.8) +
  geom_vline(xintercept = 0, linetype = 'dashed')

post_p <- post_p +
  xlab('Interaction coefficient') +
  ylab('Probability density') +
  coord_cartesian(y = c(0, 1.0), clip = 'off') +
  scale_y_continuous(breaks = seq(0, 2, 0.5))

post_p <- post_p +
  theme_classic() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.title.x = element_text(face = 'bold', size = 14,
                                    margin = margin(t = 10)),
        axis.title.y = element_text(face = 'bold', size = 16,
                                    margin = margin(r = 12)))

post_p

Change/improve analysis

First, on add versus subtract, since they were our original pair, as specified in Table 1. So let’s get rid of removing. Let’s also create a second version that has add versus remove. We do this — thereby deviating from our originally specified diagnostic words in Table 1 — because we expect the cloze probability of the verb “subtract” to be much lower simply because of its frequency. We know this from the word frequency result, and so to counteract this, we can use the more colloquial “remove”, which is much more frequent than “subtract”.

add_subtract <- filter(suggest,
                       cloze_verb != 'removing')

add_remove <- filter(suggest,
                       cloze_verb != 'subtracting')

Calculate averages:

add_remove %>% 
  filter(synonym_set == 'change') %>% 
  group_by(cloze_verb) %>% 
  summarize(M = mean(verb_logprob))
## # A tibble: 2 × 2
##   cloze_verb     M
##   <chr>      <dbl>
## 1 adding     -2.80
## 2 removing   -3.83
add_remove %>% 
  filter(synonym_set == 'improve') %>% 
  group_by(cloze_verb) %>% 
  summarize(M = mean(verb_logprob))
## # A tibble: 2 × 2
##   cloze_verb     M
##   <chr>      <dbl>
## 1 adding     -2.21
## 2 removing   -4.83

Calculate Cohen’s d for both comparisons, separately for change and improve verbs:

# Add versus subtract, synonym set for change, then improve:

cohen.d(verb_logprob ~ cloze_verb | Subject(verb),
        data = filter(add_subtract, synonym_set == 'change'), paired = TRUE)
## 
## Cohen's d
## 
## d estimate: 4.415499 (large)
## 95 percent confidence interval:
##    lower    upper 
## 1.186085 7.644914
cohen.d(verb_logprob ~ cloze_verb | Subject(verb),
        data = filter(add_subtract, synonym_set == 'improve'), paired = TRUE)
## 
## Cohen's d
## 
## d estimate: 10.61465 (large)
## 95 percent confidence interval:
##     lower     upper 
##  4.156977 17.072326
# Add versus remove, synonym set for change, then improve:

cohen.d(verb_logprob ~ cloze_verb | Subject(verb),
        data = filter(add_remove, synonym_set == 'change'), paired = TRUE)
## 
## Cohen's d
## 
## d estimate: 1.314024 (large)
## 95 percent confidence interval:
##     lower     upper 
## 0.6141472 2.0139006
cohen.d(verb_logprob ~ cloze_verb | Subject(verb),
        data = filter(add_remove, synonym_set == 'improve'), paired = TRUE)
## 
## Cohen's d
## 
## d estimate: 3.269203 (large)
## 95 percent confidence interval:
##     lower     upper 
## 0.6879813 5.8504250

Get the change values to put on the y-axis:

change_probs <- filter(add_remove,
                       synonym_set == 'change',
                       cloze_verb == 'adding') %>% pull(verb_prob)

change_words <- filter(add_remove,
                       synonym_set == 'change',
                       cloze_verb == 'adding') %>% pull(verb)

Hand-adjust certain values (they are internally on the percentage scale):

change_probs[4] <- change_probs[4] + 0.13 # moderate
change_probs[6] <- change_probs[6] - 0.13 # reform
change_probs[11] <- change_probs[11] + 0.25 # transform
change_probs[8] <- change_probs[8] - 0.25 # reorganize
change_probs[2] <- change_probs[2] + 0.35 # adjust
change_probs[7] <- change_probs[7] - 0.35 # remodel

Make the plot:

# Define plot and aesthetics:

change_p <- filter(add_remove, synonym_set == 'change') %>% 
  ggplot(aes(x = cloze_verb, y = verb_prob, fill = cloze_verb, group = verb))

# Add geoms:

change_p <- change_p +
  geom_line(col = 'grey') +
  geom_point(size = 3, shape = 21, alpha = 0.85) +
  annotate(geom = 'text',
           x = rep(0.9, length(change_probs)),
           y = change_probs,
           label = change_words,
           hjust = 1,
           col = 'grey33')

# Add scales and axes labels:

change_p <- change_p +
  coord_cartesian(xlim = c(0.5, 1.85)) +
  scale_y_continuous(breaks = seq(0, 30, 10),
                     labels = seq(0, 0.30, 0.10),
                     limits = c(0, 30)) +
  scale_fill_manual(values = c("#E69F00", "#0072B2")) +
  xlab(NULL) +
  ylab('Cloze probability')

# Add themes:

change_p <- change_p +
  theme_classic() +
  theme(legend.position = 'none') +
  theme(axis.title.y = element_text(margin = margin(t = 0, r = 15,
                                                    b = 0, l = 0),
                                    size = 14, face = 'bold'),
        axis.text.x = element_text(face = 'bold', size = 10),
        plot.title = element_text(face = 'bold'))

# Show in markdown:

change_p

# Save:

ggsave(plot = change_p,
       filename = '../figures/pdf/GPT3_change.pdf',
       width = 3, height = 4)
ggsave(plot = change_p,
       filename = '../figures/png/GPT3_change.png',
       width = 3, height = 4)
ggsave(plot = change_p,
       filename = '../figures/tiff/GPT3_change.tiff',
       width = 3, height = 4)

Get the improve values to put on the y-axis:

improve_probs <- filter(add_remove,
                        synonym_set == 'improve',
                        cloze_verb == 'adding') %>% pull(verb_prob)

improve_words <- filter(add_remove,
                        synonym_set == 'improve',
                        cloze_verb == 'adding') %>% pull(verb)

Hand-adjust certain values (they are internally on the percentage scale):

improve_probs[9] <- improve_probs[9] + 0.5 # upgrade
improve_probs[1] <- improve_probs[1] - 0.5 # improve

Make the plot:

# Define plot and aesthetics:

improve_p <- filter(add_remove, synonym_set == 'improve') %>% 
  ggplot(aes(x = cloze_verb, y = verb_prob, fill = cloze_verb, group = verb))

# Add geoms:

improve_p <- improve_p +
  geom_line(col = 'grey') +
  geom_point(size = 3, shape = 21, alpha = 0.85) +
  annotate(geom = 'text',
           x = rep(0.9, length(improve_probs)),
           y = improve_probs,
           label = improve_words,
           hjust = 1,
           col = 'grey33')

# Add scales and axes labels:

improve_p <- improve_p +
  coord_cartesian(xlim = c(0.5, 1.7)) +
  scale_y_continuous(breaks = seq(0, 30, 10),
                     labels = seq(0, 0.3, 0.10),
                     limits = c(0, 30)) +
  scale_fill_manual(values = c("#E69F00", "#0072B2")) +
  xlab(NULL) +
  ylab('Cloze probability')

# Add themes:

improve_p <- improve_p +
  theme_classic() +
  theme(legend.position = 'none') +
  theme(axis.title.y = element_text(margin = margin(t = 0, r = 15,
                                                    b = 0, l = 0),
                                    size = 14, face = 'bold'),
        axis.text.x = element_text(face = 'bold', size = 10),
        plot.title = element_text(face = 'bold'))

# Show in markdown:

improve_p

# Save:

ggsave(plot = improve_p,
       filename = '../figures/pdf/GPT3_improve.pdf',
       width = 3, height = 4)
ggsave(plot = improve_p,
       filename = '../figures/png/GPT3_improve.png',
       width = 3, height = 4)
ggsave(plot = improve_p,
       filename = '../figures/tiff/GPT3_improve.tiff',
       width = 3, height = 4)

Add titles and get rid of second y-axis:

change_p <- change_p +
  ggtitle('(a) Synonyms of "to change"')

improve_p <- improve_p +
  ggtitle('(b) Synonyms of "to improve"') +
  ylab(NULL)

Put both together using patchwork:

both_p <- change_p + plot_spacer() + improve_p + plot_spacer() +
  plot_layout(widths = c(3, 0.5, 3, 1))

# Show and save:

both_p

ggsave(plot = both_p, filename = '../figures/pdf/GPT3_change_improve_double_plot.pdf',
       width = 7, height = 4)
ggsave(plot = both_p, filename = '../figures/png/GPT3_change_improve_double_plot.png',
       width = 7, height = 4)
ggsave(plot = both_p, filename = '../figures/tiff/GPT3_change_improve_double_plot.tiff',
       width = 7, height = 4)

Change/improve Bayesian model

Perform the model with remove, which is the more conservative choice here. Let’s do this first in separate models, then in a model that has the interaction. Let’s do Bayesian paired t-tests again so that we can avoid random effects (since we would have to fit random slopes but only have two data points per verb), that is, we first compute difference scores.

# Get subsets:

change_subset <- filter(add_remove, synonym_set == 'change')
improve_subset <- filter(add_remove, synonym_set == 'improve')

# Calculate difference scores:

n_verbs <- length(unique(change_subset$verb))
change_diffs <- change_subset[rep(c(TRUE, FALSE), times = n_verbs), ]$verb_logprob - # adding logprob
  change_subset[rep(c(FALSE, TRUE), times = n_verbs), ]$verb_logprob # removing logprob

n_verbs <- length(unique(improve_subset$verb))
improve_diffs <- improve_subset[rep(c(TRUE, FALSE), times = n_verbs), ]$verb_logprob - # adding logprob
  improve_subset[rep(c(FALSE, TRUE), times = n_verbs), ]$verb_logprob # removing logprob

# Put them back into tables:

change_diffs <- tibble(logprob_diff = change_diffs,
                       verb = unique(change_subset$verb))

improve_diffs <- tibble(logprob_diff = improve_diffs,
                        verb = unique(improve_subset$verb))

Define weakly informative priors. So, these are differences scores of log probabilities. This means that we need to take the base line into account (how low or high the probabilities are overall), as well as as the difference. The probabilities are all quite low, so let’s make a probability of 0.01 (= 1%) our starting point.

log(0.01) # log value corresponding to 0.01 probability (= 1%)
## [1] -4.60517

From this log value, if we assumed a difference of log +/-1, what probabilities would this give us for 68% and 95% of the normal distribution?

c(log(0.01) - 1, log(0.01) + 1) # logged values for 68%
## [1] -5.60517 -3.60517
exp(c(log(0.01) - 1, log(0.01) + 1)) # probability values for 68%
## [1] 0.003678794 0.027182818

That’s perhaps a bit low. It’s hard to intuit this, but in the data we’ve seen differences of as high as 20%, given that some of the subtract/remove cases have barely any probability at all. So let’s take the prior twice as wide:

exp(c(log(0.01) - 1.5, log(0.01) + 1.5)) # probability values for 68%
## [1] 0.002231302 0.044816891
exp(c(log(0.01) - 3, log(0.01) + 3)) # probability values for 95%
## [1] 0.0004978707 0.2008553692

This seems reasonable.

This would suggest that a Normal(0, 2) prior might be good and already very conservative.

weak_prior <- c(prior('normal(0, 1.5)', class = 'Intercept'))

Use the difference scores in separate models for change and improve synonyms:

change_mdl <- brm(logprob_diff ~ 1,
                  data = change_diffs,
                  
                  prior = weak_prior,

                  # MCMC settings:

                  seed = 42, chains = 4, cores = 4,
                  warmup = 2000, iter = 4000)
## Compiling Stan program...
## Start sampling
improve_mdl <- brm(logprob_diff ~ 1,
                   data = improve_diffs,
                  
                   prior = weak_prior,
                   
                   # MCMC settings:

                   seed = 42, chains = 4, cores = 4,
                   warmup = 2000, iter = 4000)
## Compiling Stan program...
## Start sampling

Posterior predictive checks:

pp_check(change_mdl, ndraws = 100)

pp_check(improve_mdl, ndraws = 100)

Looks good.

Show models:

change_mdl
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: logprob_diff ~ 1 
##    Data: change_diffs (Number of observations: 11) 
##   Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
##          total post-warmup draws = 8000
## 
## Population-Level Effects: 
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept     1.01      0.22     0.55     1.43 1.00     4490     4216
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.73      0.19     0.46     1.19 1.00     4723     4492
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
improve_mdl
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: logprob_diff ~ 1 
##    Data: improve_diffs (Number of observations: 9) 
##   Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
##          total post-warmup draws = 8000
## 
## Population-Level Effects: 
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept     2.39      0.47     1.35     3.24 1.00     3230     3008
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     1.36      0.42     0.81     2.36 1.00     3641     3974
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Perform hypothesis tests:

# For change verb cloze probabilities:

hypothesis(change_mdl, 'Intercept < 0')
## Hypothesis Tests for class b:
##        Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (Intercept) < 0     1.01      0.22     0.65     1.35          0         0
##   Star
## 1     
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
hypothesis(change_mdl, 'Intercept > 0')
## Hypothesis Tests for class b:
##        Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (Intercept) > 0     1.01      0.22     0.65     1.35    2665.67         1
##   Star
## 1    *
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
# For improve verb cloze probabilities:

hypothesis(improve_mdl, 'Intercept < 0')
## Hypothesis Tests for class b:
##        Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (Intercept) < 0     2.39      0.47     1.56     3.09          0         0
##   Star
## 1     
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
hypothesis(improve_mdl, 'Intercept > 0')
## Hypothesis Tests for class b:
##        Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (Intercept) > 0     2.39      0.47     1.56     3.09       3999         1
##   Star
## 1    *
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.

Plot posterior distributions for this. First for change verbs:

# Plot basics:

change_post_p <- posterior_samples(change_mdl) %>% 
  ggplot(aes(x = b_Intercept)) +
  stat_halfeye(fill = 'steelblue', alpha = 0.8) +
  geom_vline(xintercept = 0, linetype = 'dashed')
## Warning: Method 'posterior_samples' is deprecated. Please see ?as_draws for
## recommended alternatives.
# Axes and labels:

change_post_p <- change_post_p +
  xlab('Log probability difference\n(addition - subtraction)') +
  ylab('Probability density') +
  coord_cartesian(y = c(0, 1.1)) +
  scale_y_continuous(breaks = seq(0, 1, 0.25))

# Cosmetics:

change_post_p <- change_post_p +
  theme_classic() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.title.x = element_text(face = 'bold', size = 14,
                                    margin = margin(t = 10)),
        axis.title.y = element_text(face = 'bold', size = 16,
                                    margin = margin(r = 12)))

# Show:

change_post_p

Then the improve_mdl posteriors:

# Plot basics:

improve_post_p <- posterior_samples(improve_mdl) %>% 
  ggplot(aes(x = b_Intercept)) +
  stat_halfeye(fill = 'steelblue', alpha = 0.8) +
  geom_vline(xintercept = 0, linetype = 'dashed')
## Warning: Method 'posterior_samples' is deprecated. Please see ?as_draws for
## recommended alternatives.
# Axes and labels:

improve_post_p <- improve_post_p +
  xlab('Log probability difference\n(addition - subtraction)') +
  ylab('Probability density') +
  coord_cartesian(y = c(0, 1.1)) +
  scale_y_continuous(breaks = seq(0, 1, 0.25))

# Cosmetics:

improve_post_p <- improve_post_p +
  theme_classic() +
  theme(axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.title.x = element_text(face = 'bold', size = 14,
                                    margin = margin(t = 10)),
        axis.title.y = element_text(face = 'bold', size = 16,
                                    margin = margin(r = 12)))

# Show:

improve_post_p

Show both posterior distributions:

# First plot axes and title:

change_post_p <- change_post_p +
  ggtitle('(a) Change words') + 
  theme(plot.title = element_text(size = 14, face = 'bold'),
        plot.title.position = 'plot')

# Second plot axes and title:

improve_post_p <- improve_post_p +
  ylab(NULL) + 
  ggtitle('(b) Improve words') +
  theme(plot.title = element_text(size = 14, face = 'bold'),
        plot.title.position = 'plot')

# Combine:
 
both_p <- change_post_p + improve_post_p

# Show and save:

both_p

ggsave(plot = both_p,
       filename = '../figures/pdf/GTP3_change_improve_posteriors.pdf',
       width = 10, height = 4)
ggsave(plot = both_p,
       filename = '../figures/png/GTP3_change_improve_posteriors.png',
       width = 10, height = 4)
ggsave(plot = both_p,
       filename = '../figures/tiff/GTP3_change_improve_posteriors.tiff',
       width = 10, height = 4)

Combined improve/change Bayesian model

In this section, we’ll create a model that tests whether the effect size is bigger for improve than change verbs.

Bind both difference score tables together for analysis:

both_diffs <- bind_rows(change_diffs, improve_diffs)

# Identifier for synonym set:

both_diffs$type <- c(rep('change', times = nrow(change_diffs)),
                     rep('improve', times = nrow(improve_diffs)))

# Show:

both_diffs
## # A tibble: 20 × 3
##    logprob_diff verb       type   
##           <dbl> <chr>      <chr>  
##  1       1.44   change     change 
##  2       1.71   adjust     change 
##  3      -0.0100 convert    change 
##  4       1.48   moderate   change 
##  5       1.84   modify     change 
##  6       1.02   reform     change 
##  7       1.03   remodel    change 
##  8       0.500  reorganize change 
##  9       0.54   restyle    change 
## 10       1.58   revise     change 
## 11       0.200  transform  change 
## 12       2.7    improve    improve
## 13       1.43   ameliorate improve
## 14       1.68   amend      improve
## 15       3.79   augment    improve
## 16       2.14   better     improve
## 17       4.2    embellish  improve
## 18       3.98   enhance    improve
## 19       1.03   mend       improve
## 20       2.6    upgrade    improve

Make the model (we’ll use flat priors on coefficients here for simplicity):

comparison_mdl <- brm(logprob_diff ~ 1 + type,
                      data = both_diffs,
                   
                      # MCMC settings:

                      seed = 42, chains = 4, cores = 4,
                      warmup = 2000, iter = 4000)
## Compiling Stan program...
## Start sampling

Posterior predictive checks:

pp_check(comparison_mdl, ndraws = 100)

Show model output:

comparison_mdl
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: logprob_diff ~ 1 + type 
##    Data: both_diffs (Number of observations: 20) 
##   Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
##          total post-warmup draws = 8000
## 
## Population-Level Effects: 
##             Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept       1.02      0.30     0.43     1.60 1.00     6342     5024
## typeimprove     1.60      0.45     0.71     2.48 1.00     6164     4877
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.97      0.18     0.70     1.40 1.00     6147     5450
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Perform hypothesis test of whether improve verbs show a bigger difference between add and remove:

hypothesis(comparison_mdl, 'typeimprove > 0')
## Hypothesis Tests for class b:
##          Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (typeimprove) > 0      1.6      0.45     0.87     2.33    1332.33         1
##   Star
## 1    *
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
hypothesis(comparison_mdl, 'typeimprove < 0')
## Hypothesis Tests for class b:
##          Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (typeimprove) < 0      1.6      0.45     0.87     2.33          0         0
##   Star
## 1     
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.

This completes this analysis.